Load libraries
#### Load packages ####
library(camprotR)
library(ggplot2)
library(tidyverse)
library(MSnbase)
library(biobroom)
Read in the PSM data
psm_res <- readRDS('../results/psm_res.rds')
psm_res %>% names() %>% lapply(function(x){
psm_res[[x]] %>%
fData() %>%
mutate(method=x)
}) %>%
do.call(what='rbind') %>%
group_by(method) %>%
summarise(median_sn=10^median(log10(Average.Reporter.SN), na.rm=TRUE))
`summarise()` ungrouping output (override with `.groups` argument)
129.15/42
[1] 3.075
p <- psm_res %>% names() %>% lapply(function(x){
psm_res[[x]] %>%
fData() %>%
mutate(method=x)
}) %>%
do.call(what='rbind') %>%
ggplot(aes(Average.Reporter.SN, colour=method)) +
geom_density() +
theme_camprot()
print(p)

print(p + scale_x_log10())

print(p + aes(Isolation.Interference.in.Percent))

Plotting the proportion of missing values
psm_res %>% names() %>% lapply(function(x){
all <- psm_res[[x]]
hs <- all[fData(all)$species=='H.sapiens']
sc <- all[fData(all)$species=='S.cerevisiae']
slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
for(slice in names(slices)){
p <- slices[[slice]] %>% plot_missing_SN() +
ggtitle(sprintf('%s - %s', x, slice))
print(p)
p <- slices[[slice]] %>% plot_missing_SN_per_sample() +
ggtitle(sprintf('%s - %s', x, slice))
print(p)
}
return(NULL)
})
[[1]]
NULL
[[2]]
NULL












OK, so essentially all the missing values are restricted to PSMs with low (<20) Signal:Noise ratios.
source('./R/get_quant_vs_mean.R')
quant_vs_mean <- psm_res %>% lapply(get_quant_vs_mean)
quant_vs_mean <- quant_vs_mean %>% lapply(function(x){
x %>%
mutate(binned_interference=Hmisc::cut2(
Isolation.Interference.in.Percent, cuts=c(0,1,5,10,seq(20,100,20))),
binned_q=Hmisc::cut2(Percolator.q.Value, cuts=c(0, 0.001, 0.002, 0.005, 0.01)),
binned_average_sn=Hmisc::cut2(Average.Reporter.SN, cuts=c(0,10,20,30,40,60,100)),
binned_intensity=Hmisc::cut2(intensity, cuts=c(0,10,20,30,40,60,100)))
})
quant_vs_mean %>% lapply(dim)
$`AGC: 2E5`
[1] 1980560 23
$`AGC: 5E4`
[1] 2166658 23
exp_design <- pData(psm_res$`AGC: 2E5`) %>%
select(condition, S.cerevisiae=yeast, H.sapiens=human) %>%
unique()
sc_spikes <- exp_design$S.cerevisiae
hs_spikes <- exp_design$H.sapiens
get_ground_truth <- function(sc_spikes, hs_spikes, ix_1, ix_2){
comparison <- sprintf('%s vs %s', sc_spikes[ix_2], sc_spikes[ix_1])
hs_ground_truth <- hs_spikes[ix_2]/hs_spikes[ix_1]
sc_ground_truth <- sc_spikes[ix_2]/sc_spikes[ix_1]
return(c(comparison, hs_ground_truth, sc_ground_truth))
}
library(gtools)
expected <- apply(permutations(n=3,r=2), 1, function(x){
get_ground_truth(sc_spikes, hs_spikes, x[1], x[2])
}) %>% t() %>% data.frame() %>%
setNames(c('comparison', 'H.sapiens', 'S.cerevisiae')) %>%
mutate_at(vars(S.cerevisiae,
H.sapiens),
funs(as.numeric)) %>%
pivot_longer(-comparison, names_to='species', values_to='expected')
print(expected)
positive_comparisons <- expected %>% filter(species=='S.cerevisiae', expected>1) %>%
pull(comparison)
quant_vs_mean %>% names() %>% lapply(function(x){
for(sp in c("S.cerevisiae", "H.sapiens")){
p <- quant_vs_mean[[x]] %>%
filter(species==sp, comparison %in% positive_comparisons) %>%
ggplot(aes(binned_interference, diff, colour=below_notch)) +
geom_violin() +
theme_camprot(base_size=10) +
theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
facet_wrap(~comparison, scales='free') +
geom_hline(aes(yintercept=log2(expected)),
data=expected[(expected$species==sp &
expected$comparison %in% positive_comparisons),],
colour='grey', linetype=2) +
xlab('Interference') +
ylab('Difference in intensity') +
theme(aspect.ratio=.3) +
ggtitle(x)
print(p)
}
p <- quant_vs_mean[[x]] %>%
filter(species=='S.cerevisiae', Isolation.Interference.in.Percent<=60,
comparison %in% positive_comparisons) %>%
ggplot(aes(diff, colour=binned_interference)) +
geom_density() +
theme_camprot(base_size=10) +
theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
facet_grid(below_notch~comparison, scales='free') +
geom_vline(aes(xintercept=log2(expected)),
data=expected[(expected$species=='S.cerevisiae' &
expected$comparison %in% positive_comparisons),],
colour='grey', linetype=2) +
xlab('Difference in intensity') +
ylab('Density') +
xlim(-6,3) +
theme(aspect.ratio=2) +
ggtitle(x)
print(p)
})
[[1]]
[[2]]








quant_vs_mean %>% names() %>% lapply(function(x){
p <- quant_vs_mean[[x]] %>%
filter(Isolation.Interference.in.Percent<=50, # no need to consider interference>=50%
comparison %in% positive_comparisons) %>%
filter(species=='S.cerevisiae', !below_notch) %>%
ggplot(aes(diff, colour=binned_intensity)) +
geom_density() +
theme_camprot(base_size=15) +
theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
facet_grid(binned_interference~comparison, scales='free') +
geom_vline(aes(xintercept=log2(expected)),
data=expected[(expected$species=='S.cerevisiae' &
expected$comparison %in% positive_comparisons),],
colour=get_cat_palette(1), linetype=2) +
ylab('Density') +
xlab('Difference in intensity') +
xlim(-6,3) +
ggtitle(x) +
scale_colour_discrete(name='Intensity')
print(p)
print(p + aes(colour=binned_interference) + facet_grid(binned_average_sn~comparison) +
scale_colour_discrete(name='Interference (%)'))
print(p + aes(colour=binned_q) + scale_colour_discrete(name='Percolator Q'))
print(p + aes(colour=binned_interference) + facet_grid(binned_q~comparison) +
scale_colour_discrete(name='Interference (%)'))
return(NULL)
})
[[1]]
NULL
[[2]]
NULL








quant_vs_mean %>% names() %>% lapply(function(x){
p <- quant_vs_mean[[x]] %>%
filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
group_by(binned_interference, binned_intensity) %>%
summarise(median_diff=2^median(diff, na.rm=TRUE), n=length(diff)) %>%
ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
geom_tile(colour='grey') +
theme_camprot(base_size=15) +
scale_fill_gradient(high=get_cat_palette(2)[2],
low='white',
limits=c(0, 6), name='Observed\nfold change') +
theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
xlab('Binned interference') +
ylab('Binned intensity') +
ggtitle(x)
print(p + geom_text(aes(label=round(median_diff, 1)), size=3))
print(p +
aes(fill=n) +
scale_fill_gradient(high=get_cat_palette(3)[3],
low='white') +
geom_text(aes(label=n), size=3) )
})
`summarise()` regrouping output by 'binned_interference' (override with `.groups` argument)
[[1]]
[[2]]






quant_vs_mean %>% names() %>% lapply(function(x){
quant_vs_mean[[x]] %>%
filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
group_by(binned_q, binned_interference, binned_intensity) %>%
summarise(median_diff=2^median(diff, na.rm=TRUE)) %>%
ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
geom_tile(colour='grey') +
facet_wrap(~binned_q) +
theme_camprot(base_size=15) +
scale_fill_gradient2(high=get_cat_palette(2)[2],
low=get_cat_palette(2)[1],
mid='white', midpoint=0,
limits=c(-1, 6), name='Observed\nfold change') +
theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1),
panel.background=element_rect(fill="grey")) +
xlab('Binned interference') +
ylab('Binned Intensity') +
ggtitle(x)
})
`summarise()` regrouping output by 'binned_q', 'binned_interference' (override with `.groups` argument)
`summarise()` regrouping output by 'binned_q', 'binned_interference' (override with `.groups` argument)
[[1]]
[[2]]


quant_vs_mean %>% names() %>% lapply(function(x){
p <- quant_vs_mean[[x]] %>%
select(id, species, binned_q, binned_average_sn, binned_interference) %>%
unique() %>%
group_by(species, binned_q, binned_average_sn, binned_interference) %>%
tally() %>%
ggplot(aes(binned_interference, n)) +
geom_bar(stat='identity') +
facet_wrap(~species, scales='free') +
theme_camprot(base_size=15) +
theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
ggtitle(x)
print(p)
print(p + aes(binned_q))
print(p + aes(binned_average_sn) +
xlab('Signal/Noise'))
return(NULL)
})
[[1]]
NULL
[[2]]
NULL






Based on the above, I’m going to use the following thresholds:
- Isolation interference <= 10%
- Percolator Q value <= 0.001
For now, we won’t filer using SN since we want to explore the impact of the notch on fold changes in more detail first
First though, let’s filter by Percolator Q value or Isolation interence alone and check how isolation interference affects PSM-level fold change estimates.
quant_vs_mean %>% names() %>% lapply(function(x){
to_plot_q_flt <- quant_vs_mean[[x]] %>%
filter(Percolator.q.Value<=0.005) %>%
filter(species=='S.cerevisiae') %>%
filter(Isolation.Interference.in.Percent<50) %>%
arrange(Isolation.Interference.in.Percent)
to_plot_interference_flt <- quant_vs_mean[[x]] %>%
filter(Isolation.Interference.in.Percent<=10) %>%
filter(species=='S.cerevisiae') %>%
arrange(Percolator.q.Value)
p <- to_plot_q_flt %>%
ggplot(aes(log2(intensity), diff)) +
geom_point(size=0.1, alpha=0.1, colour='grey80') +
theme_camprot(base_size=12) +
facet_wrap(~comparison, scales='free_y') +
geom_hline(aes(yintercept=log2(expected)),
data=expected[expected$species=='S.cerevisiae',],
colour='black', linetype=2) +
xlab('Tag intensity (log2)') +
ylab('Difference in intensity (log2)') +
scale_colour_manual(values=c(get_cat_palette(7)),
name='Isolation interference (%)') +
ggtitle(x)
print(p)
print(p + geom_smooth(aes(colour=binned_interference), se=FALSE, size=0.5))
print(p %+% to_plot_interference_flt +
geom_smooth(aes(colour=binned_q), se=FALSE, size=0.5) +
scale_colour_manual(values=c(get_cat_palette(7)),
name='Percolator Q value'))
return(NULL)
})
[[1]]
NULL
[[2]]
NULL






Let’s plot the intensity vs difference in intensity for all comparisons and filtering schemes.
quant_vs_mean %>% names() %>% lapply(function(x){
tmp_data <- quant_vs_mean[[x]] %>%
filter(species!='mixed', !comparison %in% positive_comparisons)
p <- tmp_data %>%
ggplot(aes(log2(intensity), diff)) +
geom_point(size=0.05, alpha=0.05, colour='grey10') +
geom_density2d(size=0.2, colour=get_cat_palette(2)[2]) +
theme_camprot(base_size=12) +
facet_grid(species~comparison, scales='free_y') +
geom_hline(aes(yintercept=log2(expected)),
data=expected[!expected$comparison %in% positive_comparisons,],
colour='black', linetype=2) +
xlab('Tag intensity (log2)') +
ylab('Difference in intensity (log2)') +
ggtitle(x) +
coord_cartesian(ylim=c(-4,4))
print(p)
stop()
print(p %+% (tmp_data %>%
filter(Percolator.q.Value<=0.005)))
print(p %+% (tmp_data %>%
filter(Isolation.Interference.in.Percent<=10)))
print(p %+% (tmp_data %>%
filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10)))
print(p %+% (tmp_data %>%
filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10, Average.Reporter.SN>10)))
return(NULL)
})
Error in FUN(X[[i]], ...) :

Now, let’s filter the PSMs against these thresholds.
psm_res_flt <- psm_res %>% lapply(function(x){
out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=0)
out <- out[fData(out)$Percolator.q.Value<=0.001,]
camprotR:::message_parse(fData(out),
'Master.Protein.Accessions',
"Percolator Q value filtering")
out
})
Filtering PSMs...
99650 features found from 10512 master proteins => No quant filtering
72364 features found from 9674 master proteins => Co-isolation filtering
72364 features found from 9674 master proteins => S:N ratio filtering
56814 features found from 8696 master proteins => Percolator Q value filtering
Filtering PSMs...
109197 features found from 10974 master proteins => No quant filtering
77629 features found from 10061 master proteins => Co-isolation filtering
77629 features found from 10061 master proteins => S:N ratio filtering
62610 features found from 9164 master proteins => Percolator Q value filtering
psm_res_flt_sn <- psm_res_flt %>% lapply(function(x){
out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=10)
out
})
Filtering PSMs...
56814 features found from 8696 master proteins => No quant filtering
56814 features found from 8696 master proteins => Co-isolation filtering
55292 features found from 8619 master proteins => S:N ratio filtering
Filtering PSMs...
62610 features found from 9164 master proteins => No quant filtering
62610 features found from 9164 master proteins => Co-isolation filtering
57787 features found from 8965 master proteins => S:N ratio filtering
psm_res %>% names() %>% lapply(function(x){
datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
for(dataset in names(datasets)){
all <- datasets[[dataset]][[x]]
hs <- all[fData(all)$species=='H.sapiens']
sc <- all[fData(all)$species=='S.cerevisiae']
slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
for(slice in names(slices)){
p <- slices[[slice]] %>% plot_TMT_notch() +
ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
print(p)
}
}
return(NULL)
})
[[1]]
NULL
[[2]]
NULL


















Per-tag notch plots
psm_res %>% names() %>% lapply(function(x){
datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
for(dataset in names(datasets)){
all <- datasets[[dataset]][[x]]
hs <- all[fData(all)$species=='H.sapiens']
sc <- all[fData(all)$species=='S.cerevisiae']
slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
for(slice in names(slices)){
p <- slices[[slice]] %>% plot_TMT_notch(facet_by_sample=TRUE) +
ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
print(p)
}
}
return(NULL)
})
[[1]]
NULL
[[2]]
NULL


















Tallies for fraction sub-notch PSMs per protein
psm_res %>% names() %>% lapply(function(x){
datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
for(dataset in names(datasets)){
all <- datasets[[dataset]][[x]]
hs <- all[fData(all)$species=='H.sapiens']
sc <- all[fData(all)$species=='S.cerevisiae']
slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
for(slice in names(slices)){
notch_per_protein <- get_notch_per_protein(slices[[slice]]) %>%
filter(fraction_below>0)
p <- plot_fraction_below_notch_per_prot(notch_per_protein) +
ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
print(p)
}
}
return(NULL)
})
[[1]]
NULL
[[2]]
NULL


















Missing values frequencies.
psm_res %>% names() %>% lapply(function(x){
datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
for(dataset in names(datasets)){
all <- datasets[[dataset]][[x]]
hs <- all[fData(all)$species=='H.sapiens']
sc <- all[fData(all)$species=='S.cerevisiae']
slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
for(slice in names(slices)){
plotNA(slices[[slice]], pNA = 0)
}
}
return(NULL)
})
[[1]]
NULL
[[2]]
NULL


















Save out objects for downstream notebooks
saveRDS(quant_vs_mean, '../results/quant_vs_mean.rds')
saveRDS(psm_res_flt, '../results/psm_res_flt.rds')
saveRDS(psm_res_flt_sn, '../results/psm_res_flt_sn.rds')
saveRDS(expected, '../results/expected.rds')
---
title: 'Filter PSMs'
author:
  - name: "Tom Smith"
    affiliation: "Cambridge Centre for Proteomics"
date: "`r format(Sys.time(), '%d %B, %Y')`"
abstract: | 
  Here, we filter the PSM-level PD output, with thresholds informed by missing values,
  notch prominence and observed fold changes vs ground truths.
output:
  pdf_document:
  html_notebook: default
geometry: margin=1in
fontfamily: mathpazo
fontsize: 11pt
---

Load libraries
```{r setup, message=FALSE}

#### Load packages ####
library(camprotR)
library(ggplot2)
library(tidyverse)
library(MSnbase)
library(biobroom)
```

Read in the PSM data
```{r}
psm_res <- readRDS('../results/psm_res.rds')
```

```{r}
psm_res %>% names() %>% lapply(function(x){
  psm_res[[x]] %>%
    fData() %>%
    mutate(method=x)
}) %>%
  do.call(what='rbind') %>%
  group_by(method) %>%
  summarise(median_sn=10^median(log10(Average.Reporter.SN), na.rm=TRUE))
129.15/42
```

```{r}
p <- psm_res %>% names() %>% lapply(function(x){
  psm_res[[x]] %>%
    fData() %>%
    mutate(method=x)
}) %>%
  do.call(what='rbind') %>%
  ggplot(aes(Average.Reporter.SN, colour=method)) +
    geom_density() +
    theme_camprot()

print(p)
print(p + scale_x_log10())

print(p + aes(Isolation.Interference.in.Percent))
```

Plotting the proportion of missing values 
```{r}

psm_res %>% names() %>% lapply(function(x){
  
  all <- psm_res[[x]]
  hs <- all[fData(all)$species=='H.sapiens']
  sc <- all[fData(all)$species=='S.cerevisiae']
  
  slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
  for(slice in names(slices)){
    p <- slices[[slice]] %>% plot_missing_SN() +
      ggtitle(sprintf('%s - %s', x, slice))
    print(p)
  
    p <- slices[[slice]] %>% plot_missing_SN_per_sample() +
      ggtitle(sprintf('%s - %s', x, slice))
    print(p)
  }
  return(NULL)
})
```
OK, so essentially all the missing values are restricted to PSMs with low (<20) Signal:Noise ratios.

```{r}

source('./R/get_quant_vs_mean.R')


quant_vs_mean <- psm_res %>% lapply(get_quant_vs_mean)

quant_vs_mean <- quant_vs_mean %>% lapply(function(x){
  x %>%
    mutate(binned_interference=Hmisc::cut2(
      Isolation.Interference.in.Percent, cuts=c(0,1,5,10,seq(20,100,20))),
      binned_q=Hmisc::cut2(Percolator.q.Value, cuts=c(0, 0.001, 0.002, 0.005, 0.01)),
      binned_average_sn=Hmisc::cut2(Average.Reporter.SN, cuts=c(0,10,20,30,40,60,100)),
      binned_intensity=Hmisc::cut2(intensity, cuts=c(0,10,20,30,40,60,100)))
})
```


```{r}
quant_vs_mean %>% lapply(dim)
```



```{r}
exp_design <- pData(psm_res$`AGC: 2E5`) %>%
  select(condition, S.cerevisiae=yeast, H.sapiens=human) %>%
  unique()
  
sc_spikes <- exp_design$S.cerevisiae
hs_spikes <- exp_design$H.sapiens

get_ground_truth <- function(sc_spikes, hs_spikes, ix_1, ix_2){
  comparison <- sprintf('%s vs %s', sc_spikes[ix_2], sc_spikes[ix_1])
  hs_ground_truth <- hs_spikes[ix_2]/hs_spikes[ix_1]
  sc_ground_truth <- sc_spikes[ix_2]/sc_spikes[ix_1]
  return(c(comparison, hs_ground_truth, sc_ground_truth))
}
library(gtools)

expected <- apply(permutations(n=3,r=2), 1, function(x){
  get_ground_truth(sc_spikes, hs_spikes, x[1], x[2])
}) %>% t() %>% data.frame() %>%
  setNames(c('comparison', 'H.sapiens', 'S.cerevisiae')) %>%
  mutate_at(vars(S.cerevisiae, 
                 H.sapiens), 
            funs(as.numeric)) %>%
  pivot_longer(-comparison, names_to='species', values_to='expected')

print(expected)

positive_comparisons <- expected %>% filter(species=='S.cerevisiae', expected>1) %>%
  pull(comparison)
```
```{r}
quant_vs_mean %>% names() %>% lapply(function(x){
  for(sp in c("S.cerevisiae", "H.sapiens")){
    p <- quant_vs_mean[[x]] %>%
      filter(species==sp, comparison %in% positive_comparisons) %>%
      ggplot(aes(binned_interference, diff, colour=below_notch)) +
      geom_violin() +
      theme_camprot(base_size=10) +
      theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
      facet_wrap(~comparison, scales='free') +
      geom_hline(aes(yintercept=log2(expected)),
                 data=expected[(expected$species==sp &
                                  expected$comparison %in% positive_comparisons),],
                 colour='grey', linetype=2) +
      xlab('Interference') +
      ylab('Difference in intensity') +
      theme(aspect.ratio=.3) +
      ggtitle(x)
    
    print(p)
  }
  
  p <- quant_vs_mean[[x]] %>%
    filter(species=='S.cerevisiae', Isolation.Interference.in.Percent<=60,
           comparison %in% positive_comparisons) %>%
    ggplot(aes(diff, colour=binned_interference)) +
    geom_density() +
    theme_camprot(base_size=10) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    facet_grid(below_notch~comparison, scales='free') +
    geom_vline(aes(xintercept=log2(expected)),
               data=expected[(expected$species=='S.cerevisiae' &
                                expected$comparison %in% positive_comparisons),],
               colour='grey', linetype=2) +
    xlab('Difference in intensity') +
    ylab('Density') +
    xlim(-6,3) +
    theme(aspect.ratio=2) +
    ggtitle(x)
  
  print(p)
})
```
```{r, fig.height=8, fig.width=8, warning=FALSE}
quant_vs_mean %>% names() %>% lapply(function(x){
  p <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=50, # no need to consider interference>=50%
           comparison %in% positive_comparisons) %>% 
    filter(species=='S.cerevisiae', !below_notch) %>%
    ggplot(aes(diff, colour=binned_intensity)) +
    geom_density() +
    theme_camprot(base_size=15) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    facet_grid(binned_interference~comparison, scales='free') +
    geom_vline(aes(xintercept=log2(expected)),
               data=expected[(expected$species=='S.cerevisiae' &
                                expected$comparison %in% positive_comparisons),],
               colour=get_cat_palette(1), linetype=2) +
    ylab('Density') +
    xlab('Difference in intensity') +
    xlim(-6,3) +
    ggtitle(x) +
    scale_colour_discrete(name='Intensity')
  
  print(p)
  print(p + aes(colour=binned_interference) + facet_grid(binned_average_sn~comparison) +
    scale_colour_discrete(name='Interference (%)'))
  
  print(p + aes(colour=binned_q) + scale_colour_discrete(name='Percolator Q'))
  print(p + aes(colour=binned_interference) + facet_grid(binned_q~comparison) +
          scale_colour_discrete(name='Interference (%)'))

  
  return(NULL)
})
```

```{r}
quant_vs_mean %>% names() %>% lapply(function(x){
    p <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
    filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
    group_by(binned_interference, binned_intensity) %>%
    summarise(median_diff=2^median(diff, na.rm=TRUE), n=length(diff)) %>%
    ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
    geom_tile(colour='grey') +
    theme_camprot(base_size=15) +
    scale_fill_gradient(high=get_cat_palette(2)[2],
                        low='white',
                        limits=c(0, 6), name='Observed\nfold change') +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    xlab('Binned interference') +
    ylab('Binned intensity') +
    ggtitle(x)
    
    print(p + geom_text(aes(label=round(median_diff, 1)), size=3))
    
    print(p +
            aes(fill=n) +
            scale_fill_gradient(high=get_cat_palette(3)[3],
                                low='white') +
            geom_text(aes(label=n), size=3) )
})
```

```{r, fig.height=6, fig.width=6}
quant_vs_mean %>% names() %>% lapply(function(x){
    quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
    filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
    group_by(binned_q, binned_interference, binned_intensity) %>%
    summarise(median_diff=2^median(diff, na.rm=TRUE)) %>%
    ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
    geom_tile(colour='grey') +
    facet_wrap(~binned_q) +
    theme_camprot(base_size=15) +
    scale_fill_gradient2(high=get_cat_palette(2)[2],
                         low=get_cat_palette(2)[1],
                         mid='white', midpoint=0,
                        limits=c(-1, 6), name='Observed\nfold change') +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1),
          panel.background=element_rect(fill="grey")) +
    xlab('Binned interference') +
    ylab('Binned Intensity') +
    ggtitle(x)
})
```

```{r}

quant_vs_mean %>% names() %>% lapply(function(x){
  p <- quant_vs_mean[[x]] %>%
    select(id, species, binned_q, binned_average_sn, binned_interference) %>%
    unique() %>%
    group_by(species, binned_q, binned_average_sn, binned_interference) %>%
    tally() %>%
    ggplot(aes(binned_interference, n)) +
    geom_bar(stat='identity') +
    facet_wrap(~species, scales='free') +
    theme_camprot(base_size=15) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    ggtitle(x)
  
  print(p)
  print(p + aes(binned_q))
  print(p + aes(binned_average_sn) +
          xlab('Signal/Noise'))
  
  return(NULL)
})
```


Based on the above, I'm going to use the following thresholds:

- Isolation interference <= 10%
- Percolator Q value <= 0.001

For now, we won't filer using SN since we want to explore the impact of the notch on fold changes in more detail first

First though, let's filter by Percolator Q value or Isolation interence alone and check how isolation interference affects PSM-level fold change estimates.
```{r, fig.height=9, fig.width=9}
quant_vs_mean %>% names() %>% lapply(function(x){
  
  to_plot_q_flt <- quant_vs_mean[[x]] %>%
    filter(Percolator.q.Value<=0.005) %>%
    filter(species=='S.cerevisiae') %>%
    filter(Isolation.Interference.in.Percent<50) %>%
    arrange(Isolation.Interference.in.Percent)
  
  to_plot_interference_flt <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=10) %>%
    filter(species=='S.cerevisiae') %>%
    arrange(Percolator.q.Value)
  
  p <- to_plot_q_flt %>%
    ggplot(aes(log2(intensity), diff)) +
    geom_point(size=0.1, alpha=0.1, colour='grey80') +
    theme_camprot(base_size=12) +
    facet_wrap(~comparison, scales='free_y') +
    geom_hline(aes(yintercept=log2(expected)),
               data=expected[expected$species=='S.cerevisiae',],
               colour='black', linetype=2) +
    xlab('Tag intensity (log2)') +
    ylab('Difference in intensity (log2)') +
    scale_colour_manual(values=c(get_cat_palette(7)),
                        name='Isolation interference (%)') +
    ggtitle(x)
  
  print(p)
  print(p + geom_smooth(aes(colour=binned_interference), se=FALSE, size=0.5))
  print(p %+% to_plot_interference_flt +
          geom_smooth(aes(colour=binned_q), se=FALSE, size=0.5) +
          scale_colour_manual(values=c(get_cat_palette(7)),
                        name='Percolator Q value'))

  return(NULL)
})
```
Let's plot the intensity vs difference in intensity for all comparisons and filtering schemes.
```{r}
quant_vs_mean %>% names() %>% lapply(function(x){
  
    tmp_data <-  quant_vs_mean[[x]] %>%
      filter(species!='mixed', !comparison %in% positive_comparisons)
    
    p <- tmp_data %>%
      ggplot(aes(log2(intensity), diff)) +
      geom_point(size=0.05, alpha=0.05, colour='grey10') +
      geom_density2d(size=0.3, colour=get_cat_palette(2)[2]) +
      theme_camprot(base_size=12) +
      facet_grid(species~comparison, scales='free_y') +
      geom_hline(aes(yintercept=log2(expected)),
                 data=expected[!expected$comparison %in% positive_comparisons,],
                 colour='black', linetype=2) +
      xlab('Tag intensity (log2)') +
      ylab('Difference in intensity (log2)') +
      ggtitle(x) +
      coord_cartesian(ylim=c(-4,4))
  
    print(p)

    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005)))
    print(p %+% (tmp_data %>%
                   filter(Isolation.Interference.in.Percent<=10)))
    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10)))
    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10, Average.Reporter.SN>10)))
    return(NULL)
})
```


Now, let's filter the PSMs against these thresholds.
```{r}
psm_res_flt <- psm_res %>% lapply(function(x){
  out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=0)
  
  out <- out[fData(out)$Percolator.q.Value<=0.001,]
  camprotR:::message_parse(fData(out),
                           'Master.Protein.Accessions',
                           "Percolator Q value filtering")
  out
})

psm_res_flt_sn <- psm_res_flt %>% lapply(function(x){
  out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=10)
  out
})
```

```{r}

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      p <- slices[[slice]] %>% plot_TMT_notch() +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      print(p)
    }
  }
  
  return(NULL)
  
})

```

Per-tag notch plots
```{r, fig.height=8, fig.width=8}

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      p <- slices[[slice]] %>% plot_TMT_notch(facet_by_sample=TRUE) +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      print(p)
    }
  }
  
  return(NULL)
  
})
```

Tallies for fraction sub-notch PSMs per protein
```{r}

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      
      notch_per_protein <- get_notch_per_protein(slices[[slice]]) %>%
        filter(fraction_below>0)
      
      p <- plot_fraction_below_notch_per_prot(notch_per_protein) +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      
      print(p)
      }
  }
  
  return(NULL)
  
})


```


Missing values frequencies.
```{r}
psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      plotNA(slices[[slice]], pNA = 0)
    }
  }
  
  return(NULL)
  
})

```

Save out objects for downstream notebooks
```{r}
saveRDS(quant_vs_mean, '../results/quant_vs_mean.rds')
saveRDS(psm_res_flt, '../results/psm_res_flt.rds')
saveRDS(psm_res_flt_sn, '../results/psm_res_flt_sn.rds')
saveRDS(expected, '../results/expected.rds')
```


